from warnings import filterwarnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
filterwarnings('ignore')
# Load the Zomato dataset and take a first look at its size and column types.
csv_path = r'C:\Users\Administrator\Documents\Data Analytics Real World Project -Python\3-Zomato Data Analysis/zomato.csv'
df = pd.read_csv(csv_path)
df.head()
df.shape
df.dtypes
# Number of distinct values in the 'name' column.
len(df['name'].unique())
# Checking null values: per-column counts first, then the share of missing
# rows for every column that has at least one gap.
df.isna().sum()
features_na = [feature for feature in df.columns if df[feature].isna().sum() > 0]
features_na
for feature in features_na:
    # Percentage of rows in which this column is missing, rounded to 4 places.
    missing_pct = np.round(df[feature].isna().sum() / len(df) * 100, 4)
    print(' {} has {}% missing values '.format(feature, missing_pct))

# Drop the rows that have no value in 'rate'; the unique values are shown
# before and after so the effect of the drop can be compared.
df['rate'].unique()
df.dropna(axis='index', inplace=True, subset=['rate'])
df['rate'].unique()
df.shape
def split(x):
    """Return the part of *x* before the first ``'/'``, stripped of whitespace.

    For example ``split('4.1/5')`` returns ``'4.1'`` and ``split('3.8 /5')``
    returns ``'3.8'``.  A string without a ``'/'`` (such as ``'NEW'`` or
    ``'-'``) is returned unchanged apart from the stripping.

    Parameters
    ----------
    x : str
        The text to split.

    Returns
    -------
    str
        The leading segment of *x*.
    """
    return x.split('/')[0].strip()
# Turn the textual 'rate' column into floats.
df['rate'] = df['rate'].apply(split)
df['rate'].unique()
# 'NEW' and '-' are not numeric, so they are mapped to 0.  The replacement is
# limited to the 'rate' column so that identical strings in any other column
# are left untouched.
# NOTE(review): a 0 rating pulls the per-restaurant averages down; consider
# NaN instead if these placeholders mean "not rated yet" -- confirm intent.
df['rate'] = df['rate'].replace({'NEW': 0, '-': 0})
df['rate'].unique()
df['rate'] = df['rate'].astype(float)
df['rate'].dtype
# Mean rating per restaurant name; the 20 highest are drawn as a bar chart.
mean_rating = df.groupby('name')['rate'].mean()
mean_rating.nlargest(20).plot.bar()

# The same means as a two-column frame (Restaurant, Rating).
df_rate = mean_rating.to_frame()
df_rate = df_rate.reset_index()
df_rate.columns = ['Restaurant', 'Rating']
df_rate.head(20)
df_rate.shape

sns.set_style(style='whitegrid')
# sns.distplot is deprecated in favour of histplot/displot; a density-scaled
# histogram with a KDE overlay draws the same histogram-plus-curve picture.
sns.histplot(df_rate['Rating'], kde=True, stat='density')
# The 20 most frequent values of 'name', i.e. the names with the most rows.
plt.figure(figsize=(10, 7))
chains = df['name'].value_counts().iloc[:20]
sns.barplot(x=chains, y=chains.index, palette='deep')
plt.title("Most famous restaurants chains in Bangaluru")
plt.xlabel("Number of outlets")

# Same chart with plotly express.  `px` has to be imported before this point;
# the original only imported it further down, which raised a NameError here.
fig1 = px.bar(x=chains, y=chains.index)
fig1.show()
# Pie chart of the value counts of 'online_order'.
x = df['online_order'].value_counts()
# NOTE(review): the labels are positional -- they assume value_counts() lists
# the "accepts online orders" category first and that there are exactly two
# categories (explode also has two entries).  Confirm against x.index.
labels = ['Accepted', 'Not Accepted']
# The labels were defined but never handed to the chart, leaving the slices
# unlabelled; pass them explicitly.
plt.pie(x, labels=labels, explode=[0.0, 0.1], autopct='%1.1f%%')
# Plotting the same chart using Plotly
import plotly.express as px
# Plotly pie of the 'online_order' value counts.
x = df['online_order'].value_counts()
labels = ['Accepted', 'Not Accepted']
fig = px.pie(
    df,
    values=x,
    names=labels,
    title='Pie Chart for Online Order Acceptance',
)
fig.show()

# Plotly pie of the 'book_table' value counts.
x = df['book_table'].value_counts()
labels = ['Allowed', 'Not Allowed']
fig = px.pie(
    df,
    values=x,
    names=labels,
    title='Pie Chart for Online Booking tables',
)
fig.show()
import plotly.graph_objs as go
from plotly.offline import iplot
# 'book_table' value counts as a plotly graph_objects pie.
x = df['book_table'].value_counts()
labels = ['Allowed', 'Not Allowed']
trace = go.Pie(
    labels=labels,
    values=x,
    hoverinfo='label+value',
    textinfo='percent',
    textfont=dict(size=25),
    # One pull value per slice.  The original list had five entries with the
    # 0.2 at index 3, so with two slices nothing was ever pulled out.
    pull=[0, 0.2],
)
iplot([trace])
# Remove the rows that have no 'rest_type' and chart the 20 most common types.
df['rest_type'].isna().sum()
# dropna(inplace=True) on the selected column only touches that temporary
# Series; dropping on the frame with `subset` actually removes the rows.
df.dropna(axis='index', subset=['rest_type'], inplace=True)
df['rest_type'].isna().sum()
df['rest_type'].describe()
plt.figure(figsize=(20, 12))
df['rest_type'].value_counts().nlargest(20).plot.bar(color='red')
plt.gcf().autofmt_xdate()
# The 20 most common 'rest_type' values as a plotly bar chart.
rest_type_top20 = df['rest_type'].value_counts().nlargest(20)
trace1 = go.Bar(x=rest_type_top20.index, y=rest_type_top20, name='rest_type')
iplot([trace1])

# Highest 'votes' value per name; the 20 largest are drawn with matplotlib.
max_votes_per_name = df.groupby('name')['votes'].max()
max_votes_per_name.nlargest(20).plot.bar(color='red')
# Now the same chart using Plotly
# The 20 names with the highest per-name maximum of 'votes', as a plotly bar.
top_voted = df.groupby('name')['votes'].max().nlargest(20)
trace2 = go.Bar(x=top_voted.index, y=top_voted, name='name')
iplot([trace2])
# Distinct restaurant names per location.
df.groupby('location')['name'].unique()

# One row per location (index 'location') with the number of distinct names
# in column 'restaurant'.  nunique() replaces the manual loop over groups;
# note that it does not count a missing name as a value.
df_total = df.groupby('location')['name'].nunique().to_frame('restaurant')

# The ten locations with the most distinct names, as a table and as a chart.
df_total.sort_values(by='restaurant').tail(10)
df_total.sort_values(by='restaurant').tail(10).plot.bar(color='purple')
# The ten largest 'restaurant' counts in df_total as a plotly bar chart.
top_locations = df_total['restaurant'].nlargest(10)
trace3 = go.Bar(x=top_locations.index, y=top_locations, name='restaurant')
iplot([trace3])
df.isnull().sum()

# Share of each restaurant type as a percentage of all rows (ten largest).
rest_type_share = df['rest_type'].value_counts() / len(df) * 100
rest_type_share.nlargest(10).plot.bar(color='red')

# The ten most frequent values of 'cuisines' as a plotly bar chart.
cuisines = df['cuisines'].value_counts()[:10]
trace4 = go.Bar(x=cuisines.index, y=cuisines, name='cuisines')
iplot([trace4])
# Clean 'approx_cost(for two people)': drop missing rows, strip the commas
# and convert the column to integers.
len(df['approx_cost(for two people)'].value_counts())
df['approx_cost(for two people)'].isna().sum()
df.dropna(axis='index', subset=['approx_cost(for two people)'], inplace=True)
df['approx_cost(for two people)'].isna().sum()
df['approx_cost(for two people)'].unique()
df['approx_cost(for two people)'].dtype
# Vectorised literal replacement (regex=False) instead of a per-row lambda.
df['approx_cost(for two people)'] = (
    df['approx_cost(for two people)'].str.replace(',', '', regex=False)
)
df['approx_cost(for two people)'].unique()
df['approx_cost(for two people)'] = df['approx_cost(for two people)'].astype(int)
df['approx_cost(for two people)'].dtype
# sns.distplot is deprecated in favour of histplot/displot; a density-scaled
# histogram with a KDE overlay draws the same histogram-plus-curve picture.
sns.histplot(df['approx_cost(for two people)'], kde=True, stat='density')
# The graph above shows that, for most restaurants, the approximate cost for
# two people is close to 1000.

# Rating against cost, coloured by whether online orders are accepted.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x="rate",
    y='approx_cost(for two people)',
    hue='online_order',
)
plt.show()

# Votes by online-order category: seaborn first, then plotly.
sns.boxplot(data=df, x='online_order', y='votes')
fig = px.box(df, x='online_order', y='votes')
fig.show()
# From this boxplot, we can observe that the median number of votes differs between the two categories. Restaurants accepting online orders get more votes from customers, as a rating option pops up after each order placed through the Zomato application.
# Cost for two by online-order category: seaborn first, then plotly.
sns.boxplot(x='online_order', y='approx_cost(for two people)', data=df)
fig = px.box(df, x='online_order', y='approx_cost(for two people)')
fig.show()

# Rows at the cheapest and at the most expensive price point.  The extremes
# are computed from the column instead of being typed in as 40 and 6000, so
# the lookups stay correct if the data changes.
min_cost = df['approx_cost(for two people)'].min()
min_cost
df[df['approx_cost(for two people)'] == min_cost]
max_cost = df['approx_cost(for two people)'].max()
max_cost
df[df['approx_cost(for two people)'] == max_cost]

# Distribution of the cost column: seaborn first, then plotly.
plt.figure(figsize=(15, 9))
sns.histplot(df, x='approx_cost(for two people)')
plt.show()
# The plotly figure was created but never displayed; show it explicitly.
px.histogram(df, x="approx_cost(for two people)").show()

# Name, cuisines and liked dishes of the most expensive rows (the original
# ran this same lookup twice with two equivalent spellings).
df.loc[df['approx_cost(for two people)'] == max_cost, ['name', 'cuisines', 'dish_liked']]
# Work on a copy indexed by restaurant name so the bars are labelled by name.
data = df.copy()
data.dtypes
data.set_index('name', inplace=True)
cost_by_name = data['approx_cost(for two people)']

### Top 10 Most Expensive restaurant with approx cost for 2 people
priciest10 = cost_by_name.nlargest(10)
priciest10.plot.bar()
trace1 = go.Bar(x=priciest10.index, y=priciest10, name='Priority')
iplot([trace1])

# The ten lowest costs, drawn the same two ways.
cheapest10 = cost_by_name.nsmallest(10)
cheapest10.plot.bar()
trace1 = go.Bar(x=cheapest10.index, y=cheapest10, name='Priority')
iplot([trace1])

# The 20 lowest and the 20 highest costs.
cost_by_name.nsmallest(20).plot.bar()
cost_by_name.nlargest(20).plot.bar()
# Rows rated at least 4 whose cost for two is at most 500.
df_new = df[(df['rate'] >= 4) & (df['approx_cost(for two people)'] <= 500)]
df_new.shape
len(df_new['name'].unique())

# Number of distinct names per location among those rows (index 'location',
# column 'restaurants').  nunique() replaces the manual loop over groups;
# note that it does not count a missing name as a value.
location_df = df_new.groupby('location')['name'].nunique().to_frame('restaurants')
len(location_df)
location_df

# The ten locations with the most such names: matplotlib, then plotly.
top_affordable = location_df['restaurants'].nlargest(10)
top_affordable.plot.bar()
plt.gcf().autofmt_xdate()
plt.ylabel('Total restaurants')
trace1 = go.Bar(x=top_affordable.index, y=top_affordable, name='Priority')
iplot([trace1])
# Create a function that takes a location and a restaurant type and returns the names of the matching restaurants.
def return_budget(location, restaurant, max_cost=400, min_rating=4):
    """Return the names of well-rated budget restaurants for a location and type.

    Filters the module-level ``df`` for rows whose ``'location'`` equals
    *location*, whose ``'rest_type'`` equals *restaurant*, whose
    ``'approx_cost(for two people)'`` is at most *max_cost* and whose
    ``'rate'`` is strictly greater than *min_rating*.

    Parameters
    ----------
    location : str
        Value to match in the ``'location'`` column.
    restaurant : str
        Value to match in the ``'rest_type'`` column.
    max_cost : int or float, optional
        Inclusive upper bound on the cost for two (default 400).
    min_rating : int or float, optional
        Exclusive lower bound on the rating (default 4).

    Returns
    -------
    numpy.ndarray
        The distinct ``'name'`` values of the matching rows, in order of
        first appearance; empty when nothing matches.
    """
    budget = df[
        (df['approx_cost(for two people)'] <= max_cost)
        & (df['location'] == location)
        & (df['rate'] > min_rating)
        & (df['rest_type'] == restaurant)
    ]
    return budget['name'].unique()
return_budget('BTM', "Quick Bites")

# The 20 locations with the most rows: seaborn first, then plotly.
plt.figure(figsize=(10, 7))
Restaurant_locations = df['location'].value_counts()[:20]
# seaborn 0.12+ only accepts `data` positionally, so x and y are passed by
# keyword (as the earlier barplot call in this file already does).
sns.barplot(x=Restaurant_locations, y=Restaurant_locations.index)

trace1 = go.Bar(
    x=Restaurant_locations.index,
    y=Restaurant_locations,
    name='Priority',
)
iplot([trace1])
# To do so, we need the latitude and longitude of each location; hence we will use Geopy.
df.shape
len(df['location'].unique())

# One row per distinct location, plus a second column with the city name
# put in front of it.
locations = pd.DataFrame(df['location'].unique(), columns=["Name"])
locations = locations.assign(new_Name=lambda frame: 'Bangalore ' + frame['Name'])
locations.head()
!pip install geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Look up a (latitude, longitude) pair for every location.
geolocator = Nominatim(user_agent="app")
# The public Nominatim service allows at most one request per second; the
# rate limiter spaces the calls out and retries transient failures.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

lat_lon = []
# Query the 'new_Name' column (location prefixed with the city) that was
# built for this purpose; the original loop queried the bare 'Name' instead.
for place_name in locations['new_Name']:
    # A missing location yields a non-string value; skip the lookup for it.
    place = geocode(place_name) if isinstance(place_name, str) else None
    if place is None:
        lat_lon.append(np.nan)
    else:
        lat_lon.append((place.latitude, place.longitude))

locations['geo_loc'] = lat_lon
locations.head()
locations.to_csv('zomato_locations.csv', index=False)
# Number of rows per location as a two-column frame (Name, count).
# reset_index() already returns a DataFrame, so no extra wrapping is needed.
Rest_locations = df['location'].value_counts().reset_index()
Rest_locations.columns = ['Name', 'count']
Rest_locations.head()
locations.head()
locations.shape
Rest_locations.shape

# Attach the geocoded coordinates and drop the rows that have any gap.
Restaurant_locations = Rest_locations.merge(locations, on='Name', how="left").dropna()
Restaurant_locations.head()
Restaurant_locations.shape
Restaurant_locations['count'].max()
# Positional access: after dropna() the label 0 is not guaranteed to exist.
type(Restaurant_locations['geo_loc'].iloc[0])
def generateBaseMap(default_location=(12.97, 77.59), default_zoom_start=12):
    """Return a new folium map centred on *default_location*.

    Parameters
    ----------
    default_location : sequence of two floats, optional
        Latitude and longitude of the map centre (default ``12.97, 77.59``).
        The default is an immutable tuple so it cannot be shared and
        mutated between calls; a list is still accepted.
    default_zoom_start : int, optional
        Initial zoom level handed to ``folium.Map`` (default 12).

    Returns
    -------
    folium.Map
        A fresh map object with nothing added to it.
    """
    base_map = folium.Map(
        location=list(default_location),
        zoom_start=default_zoom_start,
    )
    return base_map
len(Restaurant_locations['geo_loc'])
Restaurant_locations.isna().sum()
# Positional access: after dropna() the label 0 is not guaranteed to exist.
Restaurant_locations['geo_loc'].iloc[0][0]
Restaurant_locations['geo_loc'].iloc[0][1]
np.array(Restaurant_locations['geo_loc'])

#### unzip it
# Split the (latitude, longitude) pairs into two parallel tuples and store
# them as separate columns.
lat, lon = zip(*np.array(Restaurant_locations['geo_loc']))
type(lat)
Restaurant_locations['lat'] = lat
Restaurant_locations['lon'] = lon
Restaurant_locations.head()
!pip install folium
import folium
from folium.plugins import HeatMap
# Heat map of the row count per location on a fresh base map.
basemap = generateBaseMap()
basemap

# Each heat-map point is a [lat, lon, count] triple.
heat_points = Restaurant_locations[['lat', 'lon', 'count']].values.tolist()
heat_points
# NOTE(review): `zoom` is passed through as given; confirm it is an option
# the HeatMap plugin actually honours.
HeatMap(heat_points, zoom=20, radius=15).add_to(basemap)
basemap
# Rows whose 'cuisines' value is exactly 'North Indian'.
df2 = df[df['cuisines'] == 'North Indian']
df2.head()

# Number of such rows per location, joined with the geocoded coordinates;
# rows with any gap are dropped.
north_india = df2.groupby(['location'], as_index=False)['url'].agg('count')
north_india.columns = ['Name', 'count']
north_india.head()
north_india = north_india.merge(locations, on="Name", how='left').dropna()

# Split the (latitude, longitude) pairs into columns.  The latitude column
# was misspelled 'lan'; it is only used inside this block.
north_india['lat'], north_india['lon'] = zip(*north_india['geo_loc'].values)
# drop() returns a new frame; the original discarded it, so 'geo_loc' was
# never actually removed.
north_india = north_india.drop(['geo_loc'], axis=1)

basemap = generateBaseMap()
HeatMap(
    north_india[['lat', 'lon', 'count']].values.tolist(),
    zoom=20,
    radius=15,
).add_to(basemap)
basemap
# Step-by-step cleaning of a single review string.
df.head()
# Positional access: after the earlier dropna() calls the label 0 is not
# guaranteed to exist.
df['reviews_list'].iloc[0]
data = df['reviews_list'].iloc[0].lower()
data
import re
# Keep letters only; everything else becomes a space.
data2 = re.sub('[^a-zA-Z]', ' ', data)
data2
# Remove the standalone word 'rated'.  Word boundaries keep words that merely
# contain it (e.g. 'overrated') intact.
data3 = re.sub(r'\brated\b', ' ', data2)
data3
# Remove isolated 'x' tokens only.  The unanchored pattern 'x' also deleted
# the letter inside ordinary words ('excellent' became 'e cellent').
# NOTE(review): the lone x's are presumably leftovers of escaped byte
# sequences in the raw text -- confirm.
data4 = re.sub(r'\bx\b', ' ', data3)
data4
# Collapse runs of spaces.
re.sub(' +', ' ', data4)
# Build one cleaned text blob from every review of the 'Quick Bites' rows.
dataset = df[df['rest_type'] == 'Quick Bites']
# Positional access: the label 3 is not guaranteed to be part of the subset.
type(dataset['reviews_list'].iloc[0])

cleaned_reviews = []
for review in dataset['reviews_list']:
    review = review.lower()
    # Keep letters only; everything else becomes a space.
    review = re.sub('[^a-zA-Z]', ' ', review)
    # Remove the standalone words 'rated' and 'x'.  Word boundaries stop the
    # patterns from eating letters inside real words ('excellent',
    # 'overrated'), which the unanchored patterns did.
    review = re.sub(r'\brated\b', ' ', review)
    review = re.sub(r'\bx\b', ' ', review)
    # Collapse runs of spaces.
    review = re.sub(' +', ' ', review)
    cleaned_reviews.append(review)

# Join once instead of growing a string inside the loop; the space separator
# keeps the last word of one review from fusing with the first of the next.
total_review = ' '.join(cleaned_reviews)
from wordcloud import WordCloud, STOPWORDS
# Word cloud of the combined review text, with the library's stop words
# filtered out.
stopwords = set(STOPWORDS)
cloud_options = dict(
    width=800,
    height=800,
    background_color='white',
    stopwords=stopwords,
    min_font_size=10,
)
wordcloud = WordCloud(**cloud_options).generate(total_review)

# plot the WordCloud image
plt.figure(figsize=(8, 8))
plt.imshow(wordcloud)
plt.axis("off")
def _clean_review(review):
    """Lower-case *review*, keep letters only and drop the filler tokens."""
    review = review.lower()
    review = re.sub('[^a-zA-Z]', ' ', review)
    # Word boundaries remove only the standalone tokens; the unanchored
    # patterns also ate letters inside real words ('excellent', 'overrated').
    review = re.sub(r'\brated\b', ' ', review)
    review = re.sub(r'\bx\b', ' ', review)
    return re.sub(' +', ' ', review)


def importance(restaurant):
    """Draw a word cloud of the review text for one restaurant type.

    Selects the rows of the module-level ``df`` whose ``'rest_type'`` equals
    *restaurant*, cleans every entry of their ``'reviews_list'`` column,
    joins the results into one text and renders it as an 800x800 word cloud
    on a new matplotlib figure with the axes hidden.

    Parameters
    ----------
    restaurant : str
        Value to match in the ``'rest_type'`` column.

    Returns
    -------
    None
        The function only draws; the figure is left for the caller to show.
    """
    dataset = df[df['rest_type'] == restaurant]
    # Join once instead of growing a string inside the loop; the space
    # separator keeps adjacent reviews from fusing into one word.
    total_review = ' '.join(_clean_review(review) for review in dataset['reviews_list'])

    wordcloud = WordCloud(
        width=800,
        height=800,
        background_color='white',
        stopwords=set(STOPWORDS),
        min_font_size=10,
    ).generate(total_review)

    # plot the WordCloud image
    plt.figure(figsize=(8, 8))
    plt.imshow(wordcloud)
    plt.axis("off")


importance('Quick Bites')